Load .RData
# Restore the saved workspace and the 2015 trip data. The code below expects
# `trips_per_day_2015`, `holiday`, `flu_season` and `model` to exist afterwards
# (presumably provided by these two files -- confirm).
load('.RData')
load('trips_2015.RData')

# Derive the feature columns used below for the 2015 data:
#   * is_holiday    - TRUE when the date matched a row of `holiday`
#   * weekday       - abbreviated day-of-week name
#   * is_flu_season - TRUE when the two-digit month is listed in `flu_season`
# Helper columns are dropped and `trip_num` is renamed to `num_trips`.
trips_per_day_2015 <- trips_per_day_2015 %>%
  left_join(holiday, by = "ymd") %>%
  mutate(
    is_holiday = !is.na(holiday_name),
    # NOTE(review): as.POSIXct() on a Date may shift the weekday by one day
    # depending on the session time zone / R version. Left unchanged so the
    # feature stays consistent with however the model was trained -- confirm.
    # `TRUE` is spelled out because `T` is an ordinary, reassignable variable.
    weekday = weekdays(as.POSIXct(ymd), abbreviate = TRUE),
    month = format(ymd, "%m"),
    is_flu_season = month %in% flu_season
  ) %>%
  select(-holiday_name, -day_num, -month) %>%
  rename(num_trips = trip_num) %>%
  # Rescale both temperature columns by a factor of 10 (presumably the raw
  # values are recorded in tenths of a unit -- confirm against the data source).
  mutate(
    tmin = tmin / 10,
    tmax = tmax / 10
  )
# Score the model on the 2015 data. R^2 is stored in `test_R_square` and
# displayed; RMSE is only displayed. The `##` lines are the values printed by
# the original run.
test_R_square <- rsquare(model = model, data = trips_per_day_2015)
test_R_square
## [1] 0.7245831
rmse(model = model, data = trips_per_day_2015)
## [1] 8036.401
# Attach the model's predictions (column `pred`) to the 2015 data.
plot_test_data <- add_predictions(data = trips_per_day_2015, model = model)

# Interactive time series: actual trips as black points, predictions as a red
# line with red points, plus a smoother over the predictions.
(
  ggplot(plot_test_data, aes(x = ymd, y = pred)) +
    geom_point(aes(y = num_trips)) +
    geom_line(aes(y = pred), color = "red") +
    geom_point(aes(y = pred), color = "red") +
    geom_smooth() +
    labs(
      x = "Date",
      y = "Predicted (in red)/ Actual (in black)",
      title = "Number of trips at different dates"
    )
) %>%
  ggplotly()
# Predicted vs. actual trips for 2015. The dashed line is geom_abline()'s
# default (intercept 0, slope 1), i.e. where prediction equals actual.
ggplot(data = plot_test_data, mapping = aes(x = pred, y = num_trips)) +
  geom_point() +
  geom_abline(linetype = "dashed") +
  labs(x = 'Predicted', y = 'Actual')
The number of trips from mid-September through early November was unusually high compared to 2014. This explains why the R^2 is 0.7245831 when the model is evaluated on the 2015 data.
# Stack the train, validation and test splits back into one data frame, add
# the model's predictions, and leave out the row(s) dated 2014-04-30.
plot_train_data <- rbind(train_data, validate_data, test_data) %>%
  add_predictions(model = model) %>%
  filter(ymd != "2014-04-30")
# Interactive time series for the recombined splits: actual trips as black
# points, predictions as a red line with red points, plus a smoother over the
# predictions.
(
  ggplot(plot_train_data, aes(x = ymd, y = pred)) +
    geom_point(aes(y = num_trips)) +
    geom_line(aes(y = pred), color = "red") +
    geom_point(aes(y = pred), color = "red") +
    geom_smooth() +
    labs(
      x = "Date",
      y = "Predicted (in red)/ Actual (in black)",
      title = "Number of trips at different dates"
    )
) %>%
  ggplotly()
# Predicted vs. actual trips for the recombined splits. The dashed line is
# geom_abline()'s default (intercept 0, slope 1).
ggplot(data = plot_train_data, mapping = aes(x = pred, y = num_trips)) +
  geom_point() +
  geom_abline(linetype = "dashed") +
  labs(x = 'Predicted', y = 'Actual')
# R^2 and RMSE of the model on the recombined splits; the `##` lines are the
# values printed by the original run.
rsquare(model = model, data = plot_train_data)
## [1] 0.9043868
rmse(model = model, data = plot_train_data)
## [1] 3156.534